Notes¶
For every CWM, here we compute the position wide sum of CWM contrib scores for the 30 bp and trimmed motifs, to be used as a motif importance metric in downstream analyses.
Run on durga
Setup¶
%load_ext autoreload
%autoreload 2
# deal with matplotlib plots
%matplotlib inline
# display all outputs in a cell
get_ipython().ast_node_interactivity = 'all'
from IPython.display import display, HTML
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
# misc
import sys
import os
import numpy as np
import random
from tqdm import tqdm
import pandas as pd
# io
import h5py
# plotting
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn; seaborn.set_style('white')
import matplotlib.ticker as ticker
import plotnine as pn
import logomaker
import tangermeme
from tangermeme.plot import plot_logo
from tangermeme.utils import reverse_complement
import matplotlib.animation as animation
sys.path.append("..")
from tangermeme_utils.utils import plot_motif
from chrombpnet_utils.modisco import trim_cwm
# editable text in PDFs
# https://stackoverflow.com/a/54111532
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
Variables¶
INPUTLEN = 2114
OUTPUTLEN = 1000
SHIFT_REL_INPUT = np.int64((2114 - 1000) / 2)
SHIFT_REL_INPUT
NARROWPEAK_SCHEMA = ["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"]
LOGO_ALPHABET = 'ACGT'
# finemo colorscheme
LOGO_COLORS = {"A": '#109648', "C": '#255C99', "G": '#F7B32B', "T": '#D62839'}
# logomaker colorscheme
LOGO_COLORS2= {
'G': [1, .65, 0],
'T': [1, 0, 0],
'C': [0, 0, 1],
'A': [0, .5, 0]
}
CHAR_IGNORE = ['QWERYUIOPSDFHJKLZXVBNM']
np.int64(557)
Paths¶
with open("../../DURGA_DIRS.txt", 'r') as f:
proj_in = f.readline().strip()
proj_out = f.readline().strip()
with open("../../AK_PROJ_DIR.txt", 'r') as f:
kundaje_dir = f.readline().strip()
Get modisco objects¶
cluster_meta = pd.read_csv(os.path.join(proj_out, "03-chrombpnet/01-models/qc/chrombpnet_models_keep2.tsv"), sep="\t", header=None)
# set column names
cluster_meta.columns = ["cluster", "folds", "cluster_ID"]
cluster_meta.head()
cluster_meta.shape
| cluster | folds | cluster_ID | |
|---|---|---|---|
| 0 | Adrenal_c0 | fold_0,fold_1,fold_2,fold_3,fold_4 | AG_0 |
| 1 | Adrenal_c1 | fold_0,fold_1,fold_2,fold_3,fold_4 | AG_1 |
| 2 | Adrenal_c2 | fold_0,fold_1,fold_2,fold_3,fold_4 | AG_2 |
| 3 | Adrenal_c3 | fold_0,fold_1,fold_2,fold_3,fold_4 | AG_3 |
| 4 | Brain_c0 | fold_0,fold_1,fold_2,fold_3,fold_4 | BR_0 |
(189, 3)
Inspect summed contrib scores for some examples¶
cluster = "Heart_c0"
path_modisco = proj_out + f"03-chrombpnet/01-models/modisco/bias_Heart_c0_thresh0.4/{cluster}/counts_modisco_output.h5"
modisco = h5py.File(path_modisco, 'r')
modisco.keys()
<KeysViewHDF5 ['neg_patterns', 'pos_patterns']>
cwm = modisco['pos_patterns']['pattern_0']['contrib_scores'][()]
trimmed_cwm, _, _ = trim_cwm(cwm)
trimmed_cwm.shape
plot_motif(trimmed_cwm.T)
trimmed_cwm.sum()
trimmed_cwm.sum()/trimmed_cwm.shape[0]
(14, 4)
np.float64(0.6811012164804402)
np.float64(0.04865008689146001)
cwm = modisco['pos_patterns']['pattern_10']['contrib_scores'][()]
trimmed_cwm, _, _ = trim_cwm(cwm)
trimmed_cwm.shape
plot_motif(trimmed_cwm.T)
trimmed_cwm.sum()
trimmed_cwm.sum()/trimmed_cwm.shape[0]
(14, 4)
np.float64(0.33926301833399025)
np.float64(0.02423307273814216)
cwm = modisco['neg_patterns']['pattern_0']['contrib_scores'][()]
trimmed_cwm, _, _ = trim_cwm(cwm)
trimmed_cwm.shape
plot_motif(trimmed_cwm.T)
trimmed_cwm.sum()
trimmed_cwm.sum()/trimmed_cwm.shape[0]
(13, 4)
np.float64(-0.275456271520475)
np.float64(-0.021188943963113462)
Compute across motifs and cell types¶
# collect a dict of dfs
cwm_means = []
for cluster in cluster_meta['cluster'].values:
print(cluster)
path_modisco = proj_out + f"03-chrombpnet/01-models/modisco/bias_Heart_c0_thresh0.4/{cluster}/counts_modisco_output.h5"
modisco = h5py.File(path_modisco, 'r')
# collect a dict of dfs
for grp in modisco.keys():
for motif in modisco[grp].keys():
# calculate sum of contribs and mean of contribs, across trimmed CWMs
cwm = modisco[grp][motif]['contrib_scores'][()]
trimmed_cwm, _, _ = trim_cwm(modisco[grp][motif]['contrib_scores'][()])
cwm_sum = trimmed_cwm.sum()
cwm_mean = cwm_sum/trimmed_cwm.shape[0]
abs_cwm_sum = np.abs(trimmed_cwm).sum()
abs_cwm_mean = abs_cwm_sum/trimmed_cwm.shape[0]
cwm_means.append([cluster, grp, motif, cwm_sum, abs_cwm_sum, cwm_mean, abs_cwm_mean])
Adrenal_c0 Adrenal_c1 Adrenal_c2 Adrenal_c3 Brain_c0 Brain_c1 Brain_c10 Brain_c11 Brain_c12 Brain_c13 Brain_c14 Brain_c15 Brain_c16 Brain_c17 Brain_c2 Brain_c3 Brain_c4 Brain_c5 Brain_c6 Brain_c7 Brain_c8 Brain_c9 Eye_c0 Eye_c1 Eye_c10 Eye_c11 Eye_c12 Eye_c13 Eye_c14 Eye_c15 Eye_c16 Eye_c17 Eye_c18 Eye_c2 Eye_c3 Eye_c4 Eye_c5 Eye_c6 Eye_c7 Eye_c8 Eye_c9 Heart_c0 Heart_c1 Heart_c10 Heart_c11 Heart_c12 Heart_c13 Heart_c14 Heart_c15 Heart_c2 Heart_c3 Heart_c4 Heart_c5 Heart_c6 Heart_c7 Heart_c8 Heart_c9 Liver_c0 Liver_c1 Liver_c10 Liver_c11 Liver_c12 Liver_c13 Liver_c2 Liver_c3 Liver_c4 Liver_c5 Liver_c6 Liver_c7 Liver_c8 Liver_c9 Lung_c0 Lung_c1 Lung_c10 Lung_c11 Lung_c12 Lung_c13 Lung_c14 Lung_c15 Lung_c16 Lung_c17 Lung_c2 Lung_c3 Lung_c4 Lung_c5 Lung_c6 Lung_c7 Lung_c8 Lung_c9 Muscle_c0 Muscle_c1 Muscle_c10 Muscle_c11 Muscle_c12 Muscle_c13 Muscle_c14 Muscle_c15 Muscle_c16 Muscle_c17 Muscle_c18 Muscle_c19 Muscle_c2 Muscle_c20 Muscle_c21 Muscle_c3 Muscle_c4 Muscle_c5 Muscle_c6 Muscle_c7 Muscle_c8 Muscle_c9 Skin_c0 Skin_c1 Skin_c10 Skin_c11 Skin_c12 Skin_c13 Skin_c14 Skin_c15 Skin_c16 Skin_c17 Skin_c18 Skin_c2 Skin_c3 Skin_c4 Skin_c5 Skin_c6 Skin_c7 Skin_c8 Skin_c9 Spleen_c0 Spleen_c1 Spleen_c10 Spleen_c11 Spleen_c2 Spleen_c3 Spleen_c4 Spleen_c5 Spleen_c6 Spleen_c7 Spleen_c8 Spleen_c9 Stomach_c0 Stomach_c1 Stomach_c10 Stomach_c11 Stomach_c12 Stomach_c13 Stomach_c14 Stomach_c15 Stomach_c16 Stomach_c17 Stomach_c18 Stomach_c19 Stomach_c2 Stomach_c20 Stomach_c21 Stomach_c3 Stomach_c4 Stomach_c5 Stomach_c6 Stomach_c7 Stomach_c8 Stomach_c9 Thymus_c0 Thymus_c1 Thymus_c10 Thymus_c11 Thymus_c12 Thymus_c13 Thymus_c14 Thymus_c15 Thymus_c17 Thymus_c2 Thymus_c3 Thymus_c4 Thymus_c5 Thymus_c6 Thymus_c7 Thymus_c8 Thymus_c9 Thyroid_c0 Thyroid_c1 Thyroid_c2 Thyroid_c3 Thyroid_c4 Thyroid_c5 Thyroid_c6 Thyroid_c7
# convert list to df
cwm_means_df = pd.DataFrame(cwm_means, columns=["cluster", "pattern_class", "motif", "cwm_sum", "abs_cwm_sum", "cwm_mean", "abs_cwm_mean"])
# concatenate two columns
cwm_means_df['component_pattern'] = cwm_means_df['cluster'] + "__" + cwm_means_df['pattern_class'] + "." + cwm_means_df['motif']
cwm_means_df.head()
| cluster | pattern_class | motif | cwm_sum | abs_cwm_sum | cwm_mean | abs_cwm_mean | component_pattern | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adrenal_c0 | pos_patterns | pattern_0 | 0.362828 | 0.366623 | 0.051833 | 0.052375 | Adrenal_c0__pos_patterns.pattern_0 |
| 1 | Adrenal_c0 | pos_patterns | pattern_1 | 0.426939 | 0.429196 | 0.060991 | 0.061314 | Adrenal_c0__pos_patterns.pattern_1 |
| 2 | Adrenal_c0 | pos_patterns | pattern_10 | 0.259504 | 0.260040 | 0.032438 | 0.032505 | Adrenal_c0__pos_patterns.pattern_10 |
| 3 | Adrenal_c0 | pos_patterns | pattern_11 | 0.424052 | 0.440610 | 0.017669 | 0.018359 | Adrenal_c0__pos_patterns.pattern_11 |
| 4 | Adrenal_c0 | pos_patterns | pattern_12 | 0.244701 | 0.245307 | 0.030588 | 0.030663 | Adrenal_c0__pos_patterns.pattern_12 |
# write out
cwm_means_df.to_csv(os.path.join(proj_out, "03-chrombpnet/02-compendium/modisco_compiled/cwm_sums_df.tsv"), sep="\t", index=False)